Load Libraries

# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(naniar)
## 
## Attaching package: 'naniar'
## 
## The following object is masked from 'package:skimr':
## 
##     n_complete
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(broom)
library(countrycode)
library(ggrepel)

Data Wrangling Functions - Missing Values, Selecting Columns, Data Transformations, Feature Engineering etc

# Read the raw energy table.
#
# Args:
#   url: Location of the CSV handed to read_csv(). Defaults to the
#        owid-energy-data.csv file in the OWID energy-data GitHub repository.
#
# Returns:
#   The object produced by read_csv() for `url`.
load_energy_data <- function(url = "https://raw.githubusercontent.com/owid/energy-data/master/owid-energy-data.csv") {
  energy_csv <- read_csv(file = url)
  energy_csv
}

# Filter rows and sparsely populated columns of an OWID-style table.
#
# Steps, in order:
#   1. Drop rows whose iso_code starts with "OWID", and rows with a missing
#      country or year. Rows with a missing iso_code are dropped as well,
#      because filter() discards rows whose condition evaluates to NA.
#   2. Keep rows with year >= min_year.
#   3. Keep only columns whose NA count (in the rows left after steps 1-2) is
#      below max_na_frac * nrow(df).
#   4. Drop rows with a missing population or gdp.
#
# NOTE(review): the column threshold in step 3 is relative to the row count of
# the *input* `df`, not of the already-filtered rows, so it is far more
# lenient than "max_na_frac of the remaining rows". With the energy data in
# this report no column is removed (130 columns before and after). This is
# kept as-is because later steps select columns by name; confirm whether a
# threshold relative to the filtered rows was intended.
#
# Args:
#   df: Data frame with at least iso_code, country, year, population and gdp.
#   min_year: Earliest year to keep (default 2000).
#   max_na_frac: Fraction of nrow(df) used as the exclusive upper bound on the
#                number of NAs a column may contain (default 0.4).
#
# Returns:
#   The filtered data frame.
filter_data <- function(df, min_year = 2000, max_na_frac = 0.4) {
  # Computed once, up front: an absolute NA count based on the input size.
  na_limit <- max_na_frac * nrow(df)

  df %>%
    filter(!str_starts(iso_code, "OWID"), !is.na(country), !is.na(year)) %>%
    filter(year >= min_year) %>%
    select(where(~ sum(is.na(.x)) < na_limit)) %>%
    filter(!is.na(population), !is.na(gdp))
}

# Reduce the energy table to the columns used by the rest of the analysis.
#
# Args:
#   df: Data frame containing every column named below.
#
# Returns:
#   `df` restricted to these 15 columns, in the order listed.
select_energy_columns <- function(df) {
  select(
    df,
    # identifiers and macro indicators
    country, iso_code, year, population, gdp,
    # electricity totals
    electricity_generation, renewables_electricity, fossil_electricity,
    # individual renewable sources
    solar_electricity, wind_electricity, hydro_electricity,
    # shares of the electricity mix
    renewables_share_elec, coal_share_elec, gas_share_elec, oil_share_elec
  )
}

# Append electricity metrics scaled by population and by GDP.
#
# New columns:
#   electricity_per_capita, renewables_per_capita, fossil_per_capita:
#     the corresponding electricity column divided by population.
#   electricity_per_gdp: electricity_generation divided by gdp.
#   gdp_per_electricity: gdp divided by (electricity_generation + 1); the + 1
#     keeps the denominator non-zero when generation is 0.
#
# Args:
#   df: Data frame with population, gdp, electricity_generation,
#       renewables_electricity and fossil_electricity.
#
# Returns:
#   `df` with the five columns above appended.
add_normalized_metrics <- function(df) {
  mutate(
    df,
    electricity_per_capita = electricity_generation / population,
    renewables_per_capita = renewables_electricity / population,
    fossil_per_capita = fossil_electricity / population,
    electricity_per_gdp = electricity_generation / gdp,
    gdp_per_electricity = gdp / (electricity_generation + 1)
  )
}

# Append natural-log versions of the skewed size variables.
#
# Each new column is log(x + 1), so an input of 0 maps to 0 instead of -Inf.
#
# Args:
#   df: Data frame with gdp, population and electricity_generation.
#
# Returns:
#   `df` with log_gdp, log_population and log_electricity appended.
add_log_transforms <- function(df) {
  mutate(
    df,
    log_gdp = log(gdp + 1),
    log_population = log(population + 1),
    log_electricity = log(electricity_generation + 1)
  )
}

# Percentage of missing values in every column.
#
# Args:
#   df: Data frame (or list of vectors) to inspect.
#
# Returns:
#   Named numeric vector with one entry per column: the share of NA values in
#   percent, rounded to 2 decimals. A zero-column input gives an empty numeric
#   vector.
na_percentage <- function(df) {
  # vapply() pins the result to "one double per column", so the return type
  # does not depend on the input the way sapply()'s does.
  vapply(
    df,
    function(col) round(mean(is.na(col)) * 100, 2),
    numeric(1)
  )
}

# Append ratios describing the composition of the electricity mix.
#
# New columns (each denominator has 1 added, so it is never zero):
#   fossil_to_renewable_ratio: fossil_electricity / (renewables_electricity + 1)
#   fossil_share_elec:         fossil_electricity / (electricity_generation + 1)
#   solar_share, wind_share, hydro_share:
#     the source's electricity / (renewables_electricity + 1)
#
# The results are fractions, not percentages.
# NOTE(review): the + 1 is in the same unit as the electricity columns, so it
# visibly shrinks the ratios for rows with small generation - confirm this is
# acceptable for the analysis.
#
# Args:
#   df: Data frame with fossil_electricity, renewables_electricity,
#       electricity_generation, solar_electricity, wind_electricity and
#       hydro_electricity.
#
# Returns:
#   `df` with the five ratio columns appended.
add_energy_ratios <- function(df) {
  mutate(
    df,
    fossil_to_renewable_ratio = fossil_electricity / (renewables_electricity + 1),
    fossil_share_elec = fossil_electricity / (electricity_generation + 1),
    solar_share = solar_electricity / (renewables_electricity + 1),
    wind_share = wind_electricity / (renewables_electricity + 1),
    hydro_share = hydro_electricity / (renewables_electricity + 1)
  )
}

# Append 0/1 indicator columns describing the renewable position of each row.
#
# New columns:
#   high_renewable: 1 when renewables_share_elec exceeds 50, else 0.
#   transitioning:  1 when the renewable share exceeds the fossil share,
#                   else 0.
# Both flags are NA when the inputs of their comparison are NA.
#
# Args:
#   df: Data frame with renewables_share_elec (a percentage, 0-100) and
#       fossil_share_elec (a fraction, 0-1, as built by add_energy_ratios()).
#
# Returns:
#   `df` with high_renewable and transitioning appended.
add_classification_flags <- function(df) {
  df %>%
    mutate(
      high_renewable = if_else(renewables_share_elec > 50, 1, 0),
      # The two shares are on different scales (percent vs fraction); bring
      # the fossil share onto the percentage scale before comparing them.
      transitioning = if_else(
        renewables_share_elec > 100 * fossil_share_elec, 1, 0
      )
    )
}

# Run the full feature-engineering sequence on the selected energy columns.
#
# The order matters for the last step: add_classification_flags() reads
# fossil_share_elec, which add_energy_ratios() creates.
#
# Args:
#   df: Data frame with the columns required by the four helper functions.
#
# Returns:
#   `df` with the normalized metrics, log transforms, energy ratios and
#   classification flags appended, in that order.
transform_energy_data <- function(df) {
  with_metrics <- add_normalized_metrics(df)
  with_logs <- add_log_transforms(with_metrics)
  with_ratios <- add_energy_ratios(with_logs)
  add_classification_flags(with_ratios)
}

Load Dataset 2 - Electricity Generation Data

# Load and process
# Download the raw energy table using load_energy_data()'s default URL.
energy_raw <- load_energy_data()
## Rows: 21812 Columns: 130
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (2): country, iso_code
## dbl (128): year, population, gdp, biofuel_cons_change_pct, biofuel_cons_chan...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#write_csv(energy_raw, "energy_raw.csv")
# Report the size of the raw table before any filtering.
cat("Number of rows in Raw Dataset :",  nrow(energy_raw),'\n')
## Number of rows in Raw Dataset : 21812
cat("Number of columns in Raw Dataset :", ncol(energy_raw),'\n')
## Number of columns in Raw Dataset : 130
# Apply the row/column filters defined in filter_data() and report the new size.
energy_clean <- filter_data(energy_raw)
cat("Number of rows in Filtered Dataset :",  nrow(energy_clean),'\n')
## Number of rows in Filtered Dataset : 3795
cat("Number of columns in Filtered Dataset :", ncol(energy_clean),'\n')
## Number of columns in Filtered Dataset : 130
# Keep only the columns listed in select_energy_columns().
energy_selected <- select_energy_columns(energy_clean)

cat("Number of columns in Dataset after dropping unnecessary columns:", ncol(energy_selected),'\n')
## Number of columns in Dataset after dropping unnecessary columns: 15
# Append the engineered columns (metrics, logs, ratios, flags).
energy_transformed <- transform_energy_data(energy_selected)

# Add continent info
# The continent is looked up from the country name with countrycode().
energy_transformed <- energy_transformed %>%
  mutate(continent = countrycode(country, "country.name", "continent"))

cat("Number of columns in Dataset after creating new columns and transformations:", ncol(energy_transformed),'\n')
## Number of columns in Dataset after creating new columns and transformations: 31

Check for Duplicates and Missing Values

# Show every row that exactly repeats an earlier row (auto-printed)
filter(energy_transformed, duplicated(energy_transformed))
# Share of missing values per column, in percent
print("Percentage of Missing Values left :")
## [1] "Percentage of Missing Values left :"
na_percentage(energy_transformed)
##                   country                  iso_code                      year 
##                      0.00                      0.00                      0.00 
##                population                       gdp    electricity_generation 
##                      0.00                      0.00                      0.00 
##    renewables_electricity        fossil_electricity         solar_electricity 
##                      0.00                      0.00                      0.13 
##          wind_electricity         hydro_electricity     renewables_share_elec 
##                      0.00                      1.92                      0.13 
##           coal_share_elec            gas_share_elec            oil_share_elec 
##                      0.13                      0.71                      0.13 
##    electricity_per_capita     renewables_per_capita         fossil_per_capita 
##                      0.00                      0.00                      0.00 
##       electricity_per_gdp       gdp_per_electricity                   log_gdp 
##                      0.00                      0.00                      0.00 
##            log_population           log_electricity fossil_to_renewable_ratio 
##                      0.00                      0.00                      0.00 
##         fossil_share_elec               solar_share                wind_share 
##                      0.00                      0.13                      0.00 
##               hydro_share            high_renewable             transitioning 
##                      1.92                      0.13                      0.13 
##                 continent 
##                      0.00
#write_csv(energy_transformed, "energy_transformed.csv")

Latest-Year Statistics: Top Countries by Renewable Electricity, Generation, GDP and Renewables Share

# Restrict the transformed energy data to its most recent year
latest_year <- max(energy_transformed$year, na.rm = TRUE)
latest_data <- energy_transformed %>% filter(year == latest_year)

# Top-ranked row per metric: sort descending, keep the first row.
# Ties keep whichever tied row comes first in latest_data.
top_renewables <- latest_data %>%
  arrange(desc(renewables_electricity)) %>%
  slice(1)
top_generation <- latest_data %>%
  arrange(desc(electricity_generation)) %>%
  slice(1)
top_per_capita <- latest_data %>%
  arrange(desc(electricity_per_capita)) %>%
  slice(1)
top_gdp <- latest_data %>%
  arrange(desc(gdp)) %>%
  slice(1)
top_renew_share <- latest_data %>%
  arrange(desc(renewables_share_elec)) %>%
  slice(1)

# Print the results
cat(glue::glue("Year considered: {latest_year}\n"))
## Year considered: 2022
cat(glue::glue(" Country with most renewable electricity: {top_renewables$country} ({round(top_renewables$renewables_electricity, 2)} TWh)\n"))
## Country with most renewable electricity: China (2670.18 TWh)
cat(glue::glue(" Country with highest electricity generation: {top_generation$country} ({round(top_generation$electricity_generation, 2)} TWh)\n"))
## Country with highest electricity generation: China (8848.73 TWh)
# scientific = FALSE forces fixed notation; without it format() switches to
# scientific notation for a value this large and big.mark has no effect.
# (The rendered GDP line below predates this change; re-knit to refresh it.)
cat(glue::glue(" Country with highest GDP: {top_gdp$country} (${format(round(top_gdp$gdp, 0), big.mark=',', scientific = FALSE)})\n"))
## Country with highest GDP: China ($2.696602e+13)
cat(glue::glue(" Country with highest renewables share: {top_renew_share$country} ({round(top_renew_share$renewables_share_elec, 2)}%)\n"))
## Country with highest renewables share: Albania (100%)

Load and Clean Data - Dataset 1: CO2 Emissions

# Read the OWID CO2 table straight from its raw GitHub URL
co2_data <- read_csv(file = "https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv")
## Rows: 50191 Columns: 79
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): country, iso_code
## dbl (77): year, population, gdp, cement_co2, cement_co2_per_capita, co2, co2...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#write_csv(co2_data, "co2_raw.csv")

# Filter data for required years and missing values
# (same row/column rules as the energy table, see filter_data())
co2_data <- filter_data(co2_data)

# Drop the cement, growth, land-use-change and flaring columns listed below
co2_data <- co2_data %>%
  select(-c(
    cement_co2, cement_co2_per_capita,
    co2_growth_abs, co2_growth_prct,
    co2_including_luc_growth_abs, co2_including_luc_growth_prct,
    cumulative_cement_co2, cumulative_co2_including_luc, cumulative_luc_co2,
    flaring_co2, flaring_co2_per_capita, cumulative_flaring_co2,
    share_global_flaring_co2, share_global_cumulative_flaring_co2
  ))

# CO2 emissions: percentage change versus the country's previous row.
# order_by = year makes the lag independent of the incoming row order, and
# ungroup() stops the per-country grouping leaking into later steps.
# NOTE(review): rows with missing gdp/population were removed by filter_data(),
# so the "previous row" is not guaranteed to be the preceding calendar year.
co2_data <- co2_data %>%
  group_by(country) %>%
  mutate(
    co2_pct_change =
      (co2 - lag(co2, order_by = year)) / lag(co2, order_by = year) * 100
  ) %>%
  ungroup()

# Total CO2 from fossil fuel types (NA when any of the three is missing)
co2_data <- co2_data %>%
  mutate(fossil_fuel_co2 = coal_co2 + oil_co2 + gas_co2)

# Natural logs of GDP and CO2
co2_data <- co2_data %>%
  mutate(
    log_gdp = log(gdp),
    log_co2 = log(co2)
  )

# write to csv
#write_csv(co2_transformed, "co2_transformed.csv")

Merge the Two Datasets into a Single One - Using Country and Year as Keys

# Columns that exist in both tables are removed from the CO2 side before the
# join, so the energy table's versions are the ones kept.
co2_transformed <- select(co2_data, -c(iso_code, population, gdp, log_gdp))

# Left join: every energy row is kept; CO2 columns are NA where no matching
# country/year exists.
energy_co2_merged <- energy_transformed %>%
  left_join(co2_transformed, by = c("country", "year"))


# Variable creation after merging datasets
#
# Append co2_per_kwh = co2 / electricity_generation.
#
# Rows with zero electricity generation get NA instead of the Inf/NaN a plain
# division would produce, so summaries such as mean() and max() stay finite.
# Rows where either input is NA also give NA.
#
# Args:
#   df: Data frame with numeric co2 and electricity_generation columns.
#
# Returns:
#   `df` with co2_per_kwh appended.
get_co2_electricity_ratio <- function(df) {
  df %>%
    mutate(
      co2_per_kwh = if_else(
        electricity_generation != 0,
        co2 / electricity_generation,
        NA_real_
      )
    )
}

# Append the CO2-to-electricity ratio to the merged table
energy_co2_merged <- energy_co2_merged %>%
  get_co2_electricity_ratio()

# Save dataset
#write_csv(energy_co2_merged, "/Users/manasamangipudi/Desktop/Semester-3/DataWrangling/Project/data/energy_co2_data_merged.csv")
# First rows, then per-column summary statistics (both auto-printed)
head(energy_co2_merged)
summary(energy_co2_merged)
##    country            iso_code              year        population       
##  Length:3795        Length:3795        Min.   :2000   Min.   :6.817e+04  
##  Class :character   Class :character   1st Qu.:2005   1st Qu.:3.817e+06  
##  Mode  :character   Mode  :character   Median :2011   Median :1.004e+07  
##                                        Mean   :2011   Mean   :4.238e+07  
##                                        3rd Qu.:2017   3rd Qu.:2.929e+07  
##                                        Max.   :2022   Max.   :1.426e+09  
##                                                                          
##       gdp            electricity_generation renewables_electricity
##  Min.   :3.129e+08   Min.   :   0.00        Min.   :   0.00       
##  1st Qu.:2.354e+10   1st Qu.:   3.11        1st Qu.:   0.26       
##  Median :7.689e+10   Median :  15.37        Median :   3.09       
##  Mean   :5.690e+11   Mean   : 131.72        Mean   :  29.21       
##  3rd Qu.:3.505e+11   3rd Qu.:  66.06        3rd Qu.:  13.86       
##  Max.   :2.697e+13   Max.   :8848.73        Max.   :2670.18       
##                                                                   
##  fossil_electricity solar_electricity wind_electricity  hydro_electricity
##  Min.   :   0.00    Min.   :  0.000   Min.   :  0.000   Min.   :   0.00  
##  1st Qu.:   0.71    1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:   0.11  
##  Median :   5.96    Median :  0.000   Median :  0.000   Median :   2.13  
##  Mean   :  86.56    Mean   :  1.613   Mean   :  3.947   Mean   :  21.45  
##  3rd Qu.:  38.19    3rd Qu.:  0.050   3rd Qu.:  0.250   3rd Qu.:  10.43  
##  Max.   :5760.75    Max.   :427.720   Max.   :762.700   Max.   :1321.71  
##                     NA's   :5                           NA's   :73       
##  renewables_share_elec coal_share_elec  gas_share_elec   oil_share_elec    
##  Min.   :  0.000       Min.   :  0.00   Min.   :  0.00   Min.   :  0.0000  
##  1st Qu.:  4.942       1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.8562  
##  Median : 22.355       Median :  0.00   Median :  8.48   Median :  5.0365  
##  Mean   : 34.391       Mean   : 14.14   Mean   : 23.02   Mean   : 23.8598  
##  3rd Qu.: 60.269       3rd Qu.: 19.37   3rd Qu.: 37.74   3rd Qu.: 37.6215  
##  Max.   :100.000       Max.   :100.00   Max.   :100.00   Max.   :100.0000  
##  NA's   :5             NA's   :5        NA's   :27       NA's   :5         
##  electricity_per_capita renewables_per_capita fossil_per_capita  
##  Min.   :0.000e+00      Min.   :0.000e+00     Min.   :0.000e+00  
##  1st Qu.:4.482e-07      1st Qu.:3.516e-08     1st Qu.:1.278e-07  
##  Median :1.982e-06      Median :2.228e-07     Median :9.225e-07  
##  Mean   :3.805e-06      Mean   :1.263e-06     Mean   :2.187e-06  
##  3rd Qu.:5.117e-06      3rd Qu.:9.264e-07     3rd Qu.:2.987e-06  
##  Max.   :5.603e-05      Max.   :5.603e-05     Max.   :2.413e-05  
##                                                                  
##  electricity_per_gdp gdp_per_electricity    log_gdp      log_population 
##  Min.   :0.000e+00   Min.   :3.037e+08   Min.   :19.56   Min.   :11.13  
##  1st Qu.:1.023e-10   1st Qu.:3.162e+09   1st Qu.:23.88   1st Qu.:15.15  
##  Median :1.710e-10   Median :5.019e+09   Median :25.07   Median :16.12  
##  Mean   :2.059e-10   Mean   :6.044e+09   Mean   :25.19   Mean   :16.11  
##  3rd Qu.:2.520e-10   3rd Qu.:7.150e+09   3rd Qu.:26.58   3rd Qu.:17.19  
##  Max.   :1.996e-09   Max.   :3.968e+10   Max.   :30.93   Max.   :21.08  
##                                                                         
##  log_electricity fossil_to_renewable_ratio fossil_share_elec  solar_share      
##  Min.   :0.000   Min.   :  0.0000          Min.   :0.0000    Min.   :0.000000  
##  1st Qu.:1.413   1st Qu.:  0.2512          1st Qu.:0.1898    1st Qu.:0.000000  
##  Median :2.795   Median :  1.0400          Median :0.4588    Median :0.000000  
##  Mean   :2.909   Mean   :  7.0289          Mean   :0.4806    Mean   :0.027620  
##  3rd Qu.:4.206   3rd Qu.:  5.3100          3rd Qu.:0.8031    3rd Qu.:0.009275  
##  Max.   :9.088   Max.   :369.0841          Max.   :0.9973    Max.   :0.873275  
##                                                              NA's   :5         
##    wind_share       hydro_share      high_renewable   transitioning   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.06492   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.00000   Median :0.47789   Median :0.0000   Median :1.0000  
##  Mean   :0.05141   Mean   :0.45371   Mean   :0.3137   Mean   :0.8364  
##  3rd Qu.:0.03315   3rd Qu.:0.79907   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :0.79931   Max.   :0.99366   Max.   :1.0000   Max.   :1.0000  
##                    NA's   :73        NA's   :5        NA's   :5       
##   continent              co2            co2_including_luc  
##  Length:3795        Min.   :    0.048   Min.   :   -7.975  
##  Class :character   1st Qu.:    4.418   1st Qu.:   10.343  
##  Mode  :character   Median :   20.541   Median :   39.296  
##                     Mean   :  192.954   Mean   :  227.122  
##                     3rd Qu.:   90.416   3rd Qu.:  114.046  
##                     Max.   :11447.913   Max.   :11169.235  
##                     NA's   :23          NA's   :69         
##  co2_including_luc_per_capita co2_including_luc_per_gdp
##  Min.   :-2.829               Min.   :-0.3210          
##  1st Qu.: 1.586               1st Qu.: 0.2210          
##  Median : 3.925               Median : 0.3520          
##  Mean   : 5.798               Mean   : 0.6784          
##  3rd Qu.: 7.712               3rd Qu.: 0.6420          
##  Max.   :67.600               Max.   :20.7680          
##  NA's   :69                   NA's   :69               
##  co2_including_luc_per_unit_energy co2_per_capita     co2_per_gdp    
##  Min.   :-0.2050                   Min.   : 0.0220   Min.   :0.0320  
##  1st Qu.: 0.1860                   1st Qu.: 0.6987   1st Qu.:0.1480  
##  Median : 0.2450                   Median : 2.7300   Median :0.2280  
##  Mean   : 0.8546                   Mean   : 4.9073   Mean   :0.2831  
##  3rd Qu.: 0.5050                   3rd Qu.: 6.7192   3rd Qu.:0.3420  
##  Max.   :42.7180                   Max.   :67.5840   Max.   :2.1510  
##  NA's   :166                       NA's   :23        NA's   :23      
##  co2_per_unit_energy    coal_co2        coal_co2_per_capita consumption_co2    
##  Min.   :0.0290      Min.   :   0.000   Min.   : 0.000      Min.   :    0.084  
##  1st Qu.:0.1640      1st Qu.:   0.304   1st Qu.: 0.026      1st Qu.:   10.204  
##  Median :0.2050      Median :   3.359   Median : 0.326      Median :   43.264  
##  Mean   :0.2082      Mean   : 104.342   Mean   : 1.343      Mean   :  259.824  
##  3rd Qu.:0.2360      3rd Qu.:  27.986   3rd Qu.: 1.907      3rd Qu.:  167.065  
##  Max.   :1.6710      Max.   :8168.899   Max.   :13.952      Max.   :10400.611  
##  NA's   :120         NA's   :888        NA's   :888         NA's   :1068       
##  consumption_co2_per_capita consumption_co2_per_gdp cumulative_co2    
##  Min.   : 0.024             Min.   :0.002           Min.   :     1.4  
##  1st Qu.: 1.154             1st Qu.:0.193           1st Qu.:   113.0  
##  Median : 4.108             Median :0.269           Median :   728.1  
##  Mean   : 6.351             Mean   :0.302           Mean   :  8270.0  
##  3rd Qu.: 9.630             3rd Qu.:0.387           3rd Qu.:  3738.3  
##  Max.   :47.559             Max.   :1.293           Max.   :426941.3  
##  NA's   :1068               NA's   :1068            NA's   :23        
##  cumulative_coal_co2 cumulative_gas_co2 cumulative_oil_co2 
##  Min.   :     0.00   Min.   :    0.00   Min.   :     1.39  
##  1st Qu.:    14.95   1st Qu.:   34.58   1st Qu.:    80.79  
##  Median :   181.83   Median :  219.74   Median :   334.06  
##  Mean   :  5243.99   Mean   : 1617.69   Mean   :  2771.53  
##  3rd Qu.:  1590.84   3rd Qu.: 1011.18   3rd Qu.:  1543.16  
##  Max.   :195480.73   Max.   :80314.20   Max.   :163519.67  
##  NA's   :888         NA's   :1149       NA's   :23         
##  cumulative_other_co2 energy_per_capita  energy_per_gdp       gas_co2        
##  Min.   :   0.017     Min.   :   105.1   Min.   : 0.0780   Min.   :   0.000  
##  1st Qu.:   9.877     1st Qu.:  3416.6   1st Qu.: 0.8375   1st Qu.:   1.799  
##  Median :  26.866     Median : 13904.6   Median : 1.1740   Median :  10.374  
##  Mean   : 152.141     Mean   : 26824.9   Mean   : 1.4287   Mean   :  54.764  
##  3rd Qu.:  83.343     3rd Qu.: 35500.2   3rd Qu.: 1.7995   3rd Qu.:  50.954  
##  Max.   :5071.090     Max.   :263541.6   Max.   :10.1410   Max.   :1748.499  
##  NA's   :2760         NA's   :120        NA's   :120       NA's   :1149      
##  gas_co2_per_capita ghg_excluding_lucf_per_capita ghg_per_capita   
##  Min.   : 0.0000    Min.   :  0.225               Min.   :  0.552  
##  1st Qu.: 0.1430    1st Qu.:  1.192               1st Qu.:  3.058  
##  Median : 0.8525    Median :  3.652               Median :  5.980  
##  Mean   : 2.2055    Mean   :  6.660               Mean   :  8.599  
##  3rd Qu.: 2.1150    3rd Qu.:  8.338               3rd Qu.: 10.336  
##  Max.   :42.8220    Max.   :120.443               Max.   :120.802  
##  NA's   :1149       NA's   :69                    NA's   :46       
##  land_use_change_co2 land_use_change_co2_per_capita    methane        
##  Min.   :-286.0790   Min.   :-6.6750                Min.   :   0.052  
##  1st Qu.:  -0.1952   1st Qu.:-0.0500                1st Qu.:   5.203  
##  Median :   2.1440   Median : 0.2255                Median :  15.968  
##  Mean   :  33.6902   Mean   : 0.9370                Mean   :  57.109  
##  3rd Qu.:  18.3550   3rd Qu.: 1.3215                3rd Qu.:  46.815  
##  Max.   :2805.2370   Max.   :27.9400                Max.   :1864.384  
##  NA's   :69          NA's   :69                     NA's   :46        
##  methane_per_capita nitrous_oxide     nitrous_oxide_per_capita
##  Min.   : 0.244     Min.   :  0.010   Min.   :0.0280          
##  1st Qu.: 0.826     1st Qu.:  1.377   1st Qu.:0.2260          
##  Median : 1.159     Median :  4.563   Median :0.3390          
##  Mean   : 2.257     Mean   : 16.250   Mean   :0.5374          
##  3rd Qu.: 1.924     3rd Qu.: 12.165   3rd Qu.:0.5670          
##  Max.   :57.888     Max.   :457.125   Max.   :5.3980          
##  NA's   :46         NA's   :46        NA's   :46              
##     oil_co2         oil_co2_per_capita other_co2_per_capita other_industry_co2
##  Min.   :   0.048   Min.   : 0.013     Min.   :0.0010       Min.   :  0.0000  
##  1st Qu.:   2.589   1st Qu.: 0.347     1st Qu.:0.0535       1st Qu.:  0.4575  
##  Median :   9.248   Median : 1.220     Median :0.0770       Median :  1.1580  
##  Mean   :  62.562   Mean   : 2.045     Mean   :0.0921       Mean   :  5.9598  
##  3rd Qu.:  35.446   3rd Qu.: 2.769     3rd Qu.:0.1250       3rd Qu.:  4.0020  
##  Max.   :2642.556   Max.   :22.950     Max.   :0.3020       Max.   :177.2570  
##  NA's   :23         NA's   :23         NA's   :2760         NA's   :2760      
##  primary_energy_consumption share_global_cement_co2 share_global_co2 
##  Min.   :    0.27           Min.   : 0.0000         Min.   : 0.0000  
##  1st Qu.:   28.65           1st Qu.: 0.0150         1st Qu.: 0.0140  
##  Median :  100.87           Median : 0.0610         Median : 0.0620  
##  Mean   :  881.92           Mean   : 0.6235         Mean   : 0.5895  
##  3rd Qu.:  485.16           3rd Qu.: 0.2400         3rd Qu.: 0.2670  
##  Max.   :44516.31           Max.   :52.1450         Max.   :31.0470  
##  NA's   :120                NA's   :110             NA's   :23       
##  share_global_co2_including_luc share_global_coal_co2
##  Min.   :-0.0240                Min.   : 0.0000      
##  1st Qu.: 0.0270                1st Qu.: 0.0020      
##  Median : 0.1040                Median : 0.0240      
##  Mean   : 0.6051                Mean   : 0.7908      
##  3rd Qu.: 0.3140                3rd Qu.: 0.2080      
##  Max.   :27.6250                Max.   :53.8270      
##  NA's   :69                     NA's   :888          
##  share_global_cumulative_cement_co2 share_global_cumulative_co2
##  Min.   : 0.0000                    Min.   : 0.0000            
##  1st Qu.: 0.0110                    1st Qu.: 0.0090            
##  Median : 0.0760                    Median : 0.0530            
##  Mean   : 0.6237                    Mean   : 0.5949            
##  3rd Qu.: 0.3960                    3rd Qu.: 0.2582            
##  Max.   :34.9010                    Max.   :29.0640            
##  NA's   :110                        NA's   :23                 
##  share_global_cumulative_co2_including_luc share_global_cumulative_coal_co2
##  Min.   : 0.0000                           Min.   : 0.0000                 
##  1st Qu.: 0.0330                           1st Qu.: 0.0020                 
##  Median : 0.1140                           Median : 0.0270                 
##  Mean   : 0.6361                           Mean   : 0.7908                 
##  3rd Qu.: 0.3100                           3rd Qu.: 0.2420                 
##  Max.   :24.5990                           Max.   :26.4540                 
##  NA's   :69                                NA's   :888                     
##  share_global_cumulative_gas_co2 share_global_cumulative_luc_co2
##  Min.   : 0.0000                 Min.   :-0.5580                
##  1st Qu.: 0.0200                 1st Qu.: 0.0230                
##  Median : 0.1170                 Median : 0.1300                
##  Mean   : 0.8685                 Mean   : 0.7841                
##  3rd Qu.: 0.5258                 3rd Qu.: 0.4908                
##  Max.   :40.8490                 Max.   :17.8810                
##  NA's   :1149                    NA's   :69                     
##  share_global_cumulative_oil_co2 share_global_cumulative_other_co2
##  Min.   : 0.0000                 Min.   : 0.000                   
##  1st Qu.: 0.0160                 1st Qu.: 0.176                   
##  Median : 0.0690                 Median : 0.377                   
##  Mean   : 0.5681                 Mean   : 2.222                   
##  3rd Qu.: 0.3172                 3rd Qu.: 1.230                   
##  Max.   :30.4600                 Max.   :50.697                   
##  NA's   :23                      NA's   :2760                     
##  share_global_gas_co2 share_global_luc_co2 share_global_oil_co2
##  Min.   : 0.0000      Min.   :-8.0790      Min.   : 0.0000     
##  1st Qu.: 0.0280      1st Qu.:-0.0040      1st Qu.: 0.0230     
##  Median : 0.1660      Median : 0.0460      Median : 0.0810     
##  Mean   : 0.8684      Mean   : 0.6981      Mean   : 0.5523     
##  3rd Qu.: 0.8222      3rd Qu.: 0.3857      3rd Qu.: 0.3210     
##  Max.   :26.3460      Max.   :46.1720      Max.   :24.2220     
##  NA's   :1149         NA's   :69           NA's   :23          
##  share_global_other_co2 share_of_temperature_change_from_ghg
##  Min.   : 0.000         Min.   : 0.0000                     
##  1st Qu.: 0.172         1st Qu.: 0.0550                     
##  Median : 0.434         Median : 0.1500                     
##  Mean   : 2.222         Mean   : 0.6048                     
##  3rd Qu.: 1.543         3rd Qu.: 0.3920                     
##  Max.   :58.136         Max.   :19.9580                     
##  NA's   :2760           NA's   :23                          
##  temperature_change_from_ch4 temperature_change_from_co2
##  Min.   :-0.00100            Min.   :0.000000           
##  1st Qu.: 0.00000            1st Qu.:0.000000           
##  Median : 0.00100            Median :0.001000           
##  Mean   : 0.00217            Mean   :0.005752           
##  3rd Qu.: 0.00200            3rd Qu.:0.003000           
##  Max.   : 0.06300            Max.   :0.239000           
##  NA's   :46                  NA's   :23                 
##  temperature_change_from_ghg temperature_change_from_n2o   total_ghg        
##  Min.   :0.000000            Min.   :0.00000             Min.   :    0.089  
##  1st Qu.:0.001000            1st Qu.:0.00000             1st Qu.:   21.915  
##  Median :0.002000            Median :0.00000             Median :   60.260  
##  Mean   :0.008391            Mean   :0.00033             Mean   :  296.246  
##  3rd Qu.:0.005000            3rd Qu.:0.00000             3rd Qu.:  199.122  
##  Max.   :0.285000            Max.   :0.01100             Max.   :13427.619  
##  NA's   :23                  NA's   :46                  NA's   :46         
##  total_ghg_excluding_lucf   trade_co2          trade_co2_share  
##  Min.   :    0.099        Min.   :-1532.0800   Min.   :-98.849  
##  1st Qu.:    8.705        1st Qu.:   -1.3195   1st Qu.: -4.922  
##  Median :   33.772        Median :    2.3090   Median : 12.467  
##  Mean   :  235.324        Mean   :    0.0553   Mean   : 26.469  
##  3rd Qu.:  121.294        3rd Qu.:    9.7400   3rd Qu.: 38.595  
##  Max.   :13012.948        Max.   :  654.1420   Max.   :568.635  
##  NA's   :69               NA's   :1068         NA's   :1068     
##  co2_pct_change    fossil_fuel_co2        log_co2        co2_per_kwh     
##  Min.   :-55.097   Min.   :    0.689   Min.   :-3.037   Min.   :0.06741  
##  1st Qu.: -2.441   1st Qu.:   17.125   1st Qu.: 1.486   1st Qu.:1.13940  
##  Median :  1.828   Median :   55.835   Median : 3.022   Median :1.47016  
##  Mean   :  2.817   Mean   :  290.352   Mean   : 2.982   Mean   :    Inf  
##  3rd Qu.:  6.893   3rd Qu.:  209.087   3rd Qu.: 4.504   3rd Qu.:2.07686  
##  Max.   :126.935   Max.   :10427.785   Max.   : 9.346   Max.   :    Inf  
##  NA's   :187       NA's   :1477        NA's   :23       NA's   :23

Exploratory Data Analysis

CO2 Dataset

CO2 Emissions vs Population - How GDP impacts it

# Bubble chart for 2022: population (x) against CO2 (y), both on log10 axes,
# with point size mapped to GDP and countries labelled where space allows.
energy_co2_merged %>%
  filter(year == 2022) %>%
  ggplot(aes(x = population, y = co2, size = gdp, label = country)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_text_repel(max.overlaps = 5, size = 5) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  labs(
    title = "Fossil Fuel CO2 Emissions vs Population (Bubble Size = GDP)",
    x = "Population",
    y = "CO2 Emissions (log scale)",
    size = "GDP"
  ) +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 155 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

CO2 Emissions of Global Super Powers

# CO2 over time for five selected countries, one coloured line per country.
# Line thickness uses `linewidth`; `size` for lines is deprecated since
# ggplot2 3.4.0 and raises a warning.
energy_co2_merged %>%
  filter(
    country %in% c("United States", "India", "China", "Germany", "Brazil")
  ) %>%
  ggplot(aes(x = year, y = co2, color = country)) +
  geom_line(linewidth = 1) +
  labs(
    title = "CO2 Emissions Over Time for Global Super Powers",
    y = "CO2 (Million Tonnes)"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Paris Agreement Effect on CO2 Emissions

# One row per country with its 2015 and 2022 CO2 side by side, plus the
# difference (2022 minus 2015; negative means emissions fell).
co2_change <- energy_co2_merged %>%
  select(country, year, co2, iso_code) %>%
  filter(year %in% c(2015, 2022)) %>%
  pivot_wider(
    names_from = year,
    values_from = co2,
    names_prefix = "co2_"
  ) %>%
  mutate(co2_diff_2015_2022 = co2_2022 - co2_2015)

# Horizontal bar chart of the 15 smallest (most negative) differences.
co2_change %>%
  drop_na(co2_diff_2015_2022, iso_code) %>%
  arrange(co2_diff_2015_2022) %>%
  slice(seq_len(15)) %>%
  ggplot(
    aes(
      x = reorder(country, co2_diff_2015_2022, decreasing = TRUE),
      y = co2_diff_2015_2022
    )
  ) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Countries with Most Decrease in CO2 Emissions (2015–2022)",
    x = "Country",
    y = "Change in CO2 Emissions (Mt)"
  ) +
  theme_minimal()

Electricity Generation Dataset :

Electricity Generation over Time by Continents

# Distribution of electricity generation over all country-years, per continent
energy_co2_merged %>%
  ggplot(aes(x = continent, y = electricity_generation)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Electricity by Continent") +
  theme_minimal()

# Same view for electricity generation divided by population
energy_co2_merged %>%
  ggplot(aes(x = continent, y = electricity_per_capita)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Electricity per Capita by Continent") +
  theme_minimal()

# Electricity generated from coal, gas and oil per continent and year.
#
# Only the *_share_elec columns are available here. They are treated as
# percentages of electricity_generation (their values range from 0 to 100 in
# the summary above), so each share is converted back to an absolute amount
# before summing. Adding raw percentages across countries would not give the
# TWh shown on the y axis.
energy_co2_merged %>%
  group_by(continent, year) %>%
  summarise(
    coal = sum(coal_share_elec / 100 * electricity_generation, na.rm = TRUE),
    gas = sum(gas_share_elec / 100 * electricity_generation, na.rm = TRUE),
    oil = sum(oil_share_elec / 100 * electricity_generation, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c("coal", "gas","oil"), names_to = "source", values_to = "electricity") %>%
  ggplot(aes(x = year, y = electricity, color = source)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines
  geom_line(linewidth = 1) +
  facet_wrap(~continent, scales = "free_y") +
  theme_minimal() +
  labs(title = "Growth of Coal, Gas, Oil by Continent", y = "Electricity (TWh)")
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.

# Wind, solar and hydro share per continent and year.
#
# wind_share / solar_share / hydro_share are share columns, not generation in
# TWh, so they are averaged across a continent's countries rather than summed
# and the axis no longer claims TWh.
# NOTE(review): these columns are engineered earlier in the file; confirm
# whether they are fractions or percentages before adding a unit to the label.
energy_co2_merged %>%
  group_by(continent, year) %>%
  summarise(
    wind = mean(wind_share, na.rm = TRUE),
    solar = mean(solar_share, na.rm = TRUE),
    hydro = mean(hydro_share, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(wind, solar, hydro),
    names_to = "source",
    values_to = "share"
  ) %>%
  ggplot(aes(x = year, y = share, color = source)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines.
  geom_line(linewidth = 1) +
  facet_wrap(~continent, scales = "free_y") +
  theme_minimal() +
  labs(
    title = "Growth of Wind, Solar, Hydro by Continent",
    y = "Mean share of electricity"
  )
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.

Comparing Renewable and Fossil-fuel based Electricity Generation

# Total renewable and fossil electricity per continent and year, reshaped to
# long form (one row per continent / year / source) for plotting.
energy_by_continent <- energy_transformed %>%
  group_by(continent, year) %>%
  summarise(
    renewables = sum(renewables_electricity, na.rm = TRUE),
    fossil = sum(fossil_electricity, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  pivot_longer(
    cols = c(renewables, fossil),
    names_to = "source",
    values_to = "electricity"
  )
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
# Renewable vs fossil electricity over time, one facet per continent with an
# independent y axis so smaller continents remain readable.
ggplot(energy_by_continent, aes(x = year, y = electricity, color = source)) +
  # `linewidth` replaces the `size` aesthetic for lines, which has been
  # deprecated since ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  facet_wrap(~continent, scales = "free_y") +
  theme_minimal() +
  labs(
    title = "Renewable vs Non-Renewable Electricity Generation by Continent Over Time"
  )

Electricity vs GDP in 2022 (Point Size = Fossil-to-Renewable Ratio)

# Electricity generation against GDP for 2022, both on log10 axes; point size
# encodes the fossil-to-renewable ratio and points are labelled by country.
#
# The x aesthetic uses raw `gdp`: the log10 scale below already applies the
# transform, so feeding it `log_gdp` would take the logarithm twice.
# NOTE(review): assumes `log_gdp` is a log-transformed copy of `gdp` created
# earlier in the file -- confirm.
energy_co2_merged %>%
  filter(year == 2022) %>%
  ggplot(
    aes(
      x = gdp,
      y = electricity_generation,
      size = fossil_to_renewable_ratio,
      label = country
    )
  ) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_text_repel(max.overlaps = 10, size = 5) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  labs(
    x = "GDP (log scale)",
    y = "Electricity generation (log scale)",
    size = "Fossil-to-renewable ratio"
  ) +
  theme_minimal()
## Warning: ggrepel: 155 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Linear Relationship between GDP and Electricity Generation

Merged Dataset - EDA

Fossil Electricity and CO2 Emissions Linear Relationship

# Continent-year averages of fossil electricity and CO2, drawn as a scatter
# with a fitted straight line in each continent's facet.
energy_co2_merged %>%
  group_by(continent, year) %>%
  summarise(
    mean_fossil = mean(fossil_electricity, na.rm = TRUE),
    mean_co2 = mean(co2, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = mean_fossil, y = mean_co2)) +
  geom_point(alpha = 0.6, color = "firebrick") +
  geom_smooth(method = "lm", color = "black") +
  # Both axes are free because the continents differ by orders of magnitude.
  facet_wrap(~continent, scales = "free") +
  theme_minimal() +
  labs(
    title = "Fossil Electricity vs CO2 Emissions by Continent",
    x = "Fossil Electricity (TWh)",
    y = "CO2 Emissions (Mt)"
  )
## `geom_smooth()` using formula = 'y ~ x'

CO2 Emissions per unit of electricity generated over time

# Mean CO2 emitted per unit of electricity generated, per continent and year.
energy_co2_merged %>%
  # A zero denominator yields Inf, which `na.rm = TRUE` does not remove and
  # which would turn the whole continent-year mean into Inf.
  filter(electricity_generation > 0) %>%
  mutate(co2_per_twh = co2 / electricity_generation) %>%
  group_by(continent, year) %>%
  summarise(
    co2_efficiency = mean(co2_per_twh, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year, y = co2_efficiency, color = continent)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines.
  geom_line(linewidth = 1.2) +
  # The subscript two is written as a Unicode escape so the label no longer
  # depends on the encoding the file is saved or read in.
  labs(
    title = "CO\u2082 Emissions per Unit of Electricity Generated",
    y = "CO\u2082 / TWh",
    x = "Year"
  ) +
  theme_minimal()

GHG Emissions per unit of electricity generated over time

# Mean greenhouse-gas emissions per unit of electricity generated, per
# continent and year.
energy_co2_merged %>%
  # A zero denominator yields Inf, which `na.rm = TRUE` does not remove and
  # which would turn the whole continent-year mean into Inf.
  filter(electricity_generation > 0) %>%
  mutate(ghg_per_twh = total_ghg / electricity_generation) %>%
  group_by(continent, year) %>%
  summarise(
    ghg_efficiency = mean(ghg_per_twh, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year, y = ghg_efficiency, color = continent)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines.
  geom_line(linewidth = 1.2) +
  labs(
    title = "GHG Emissions per Unit of Electricity Generated",
    y = "GHG / TWh",
    x = "Year"
  ) +
  theme_minimal()

Linear regression of CO2 against every other column (United States only), sorted by estimate

# 1. Keep United States rows and drop the identifier / grouping columns.
#    `year` stays in the data and is therefore one of the predictors tested.
us_data <- energy_co2_merged %>%
  filter(country == "United States") %>%
  select(-country, -iso_code, -continent)

# 2. Keep numeric columns with non-zero variance.
#    isTRUE() also drops columns where sd() is NA (all values missing, or a
#    single non-missing value); a bare `sd(.) > 0` would return NA there and
#    make where() fail because the predicate must be TRUE or FALSE.
us_data <- us_data %>%
  select(where(is.numeric)) %>%
  select(where(~ isTRUE(sd(., na.rm = TRUE) > 0)))

# 3. Fit one simple regression `co2 ~ <predictor>` per remaining column and
#    collect the slope row of each model (estimate, std. error, p-value).
results <- setdiff(names(us_data), "co2") %>%
  map(function(var) {
    df <- us_data %>%
      select(co2, all_of(var)) %>%
      drop_na()
    if (nrow(df) < 10) {
      return(NULL) # too few complete observations for a meaningful fit
    }
    model <- lm(co2 ~ ., data = df)
    tidy(model) %>%
      filter(term != "(Intercept)") %>%
      mutate(variable = var)
  }) %>%
  list_rbind() # NULL entries from skipped predictors are ignored

# 4. Keep significant predictors (p < 0.05), most significant first
significant <- results %>%
  filter(p.value < 0.05) %>%
  arrange(p.value)

# Show the significant predictors with the largest slopes.
# NOTE(review): slopes are in each predictor's own units, so their ranking is
# not a like-for-like comparison of effect strength.
significant %>%
  arrange(desc(estimate)) %>%
  head()

Renewable Electricity per Capita vs CO2 Emissions in 2022 (Bubble Size = Population)

library(ggrepel)
# 2022 snapshot: renewables per capita against total CO2 on log-log axes.
# Bubble area follows population; ggrepel labels only the points it can place
# without exceeding the overlap limit.
energy_co2_merged %>%
  filter(year == 2022) %>%
  ggplot(
    aes(
      x = renewables_per_capita,
      y = co2,
      size = population,
      label = country
    )
  ) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_text_repel(max.overlaps = 5, size = 4) +
  scale_x_log10() +
  scale_y_log10(labels = scales::comma) +
  theme_minimal() +
  labs(
    title = "Renewable Elec vs Co2 Emissions (Bubble Size = Population)",
    x = "Renewables_per_capita",
    y = "CO2 Emissions (log scale)",
    size = "Population"
  )
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 134 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Global Scatter Plot: CO2 per Capita vs Renewables Share
# Store per-capita emissions on the merged data; later chunks (the global
# regression) read this column, so its scale is left unchanged here.
energy_co2_merged <- energy_co2_merged %>%
  mutate(co2_per_capita = co2 / population)

# `co2` is labelled in million tonnes elsewhere in this file, so
# co2 / population is Mt per person. It is converted to tonnes per person
# inside the plot only, which makes the "(tons)" axis label accurate without
# touching the stored column.
ggplot(
  energy_co2_merged,
  aes(x = renewables_share_elec, y = log(co2_per_capita * 1e6))
) +
  geom_point(alpha = 0.4, color = "darkgreen") +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Relationship between Renewable Electricity Share and CO2 Emissions per Capita",
    x = "Renewables Share (%)",
    y = "Log CO2 Emissions per Capita (tons)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 28 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 28 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Global Regression Model
# Ordinary least-squares fit of co2_per_capita on the renewable electricity
# share and GDP over all rows of energy_co2_merged (lm drops rows with
# missing values in any model variable).
# NOTE(review): gdp enters on its raw scale, so its coefficient is extremely
# small in magnitude; consider rescaling or log-transforming it -- confirm
# with the analysis owner.
model_global <- lm(co2_per_capita ~ renewables_share_elec + gdp, data = energy_co2_merged)
# Print the coefficient table, residual summary and fit statistics.
summary(model_global)
## 
## Call:
## lm(formula = co2_per_capita ~ renewables_share_elec + gdp, data = energy_co2_merged)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -9.362e-06 -3.485e-06 -8.430e-07  1.318e-06  6.067e-05 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            6.811e-06  1.471e-07  46.314   <2e-16 ***
## renewables_share_elec -6.285e-08  2.958e-09 -21.248   <2e-16 ***
## gdp                    4.601e-19  5.126e-20   8.976   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.028e-06 on 3764 degrees of freedom
##   (28 observations deleted due to missingness)
## Multiple R-squared:  0.1335, Adjusted R-squared:  0.133 
## F-statistic: 289.9 on 2 and 3764 DF,  p-value: < 2.2e-16
# Countries highlighted in the per-country time-series plots.
top5 <- c("Germany", "India", "Brazil", "United States", "China")

# CO2 emissions per person over time for the selected countries.
energy_co2_merged %>%
  filter(country %in% top5) %>%
  # `co2` is labelled in million tonnes elsewhere in this file; multiplying
  # by 1e6 gives tonnes per person, matching the "(tons)" axis label.
  ggplot(aes(x = year, y = co2 * 1e6 / population, color = country)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines.
  geom_line(linewidth = 1) +
  labs(
    title = "CO2 Emissions per Capita Over Time (Top 5 Economies)",
    y = "CO2 per Capita (tons)",
    x = "Year"
  ) +
  theme_minimal()

# Countries highlighted in the per-country time-series plots.
top5 <- c("United States", "China", "India", "Germany", "Brazil")

# Renewable share of electricity over time for the selected countries.
energy_co2_merged %>%
  filter(country %in% top5) %>%
  ggplot(aes(x = year, y = renewables_share_elec, color = country)) +
  # `linewidth` replaces the `size` aesthetic for lines, which has been
  # deprecated since ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  labs(
    title = "Renewable Electricity Share Over Time (Top 5 Economies)",
    y = "Renewables Share (%)",
    x = "Year"
  ) +
  theme_minimal()

# Unweighted mean across countries of CO2 / electricity generation, per year.
energy_co2_merged %>%
  # A zero denominator yields Inf, which `na.rm = TRUE` does not remove and
  # which would turn that year's mean into Inf.
  filter(electricity_generation > 0) %>%
  group_by(year) %>%
  summarise(
    avg_co2_per_kwh = mean(co2 / electricity_generation, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = year, y = avg_co2_per_kwh)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines.
  geom_line(color = "firebrick", linewidth = 1.2) +
  labs(
    title = "Global Average CO2 Emissions per kWh Over Time",
    x = "Year",
    y = "CO2 per kWh"
  ) +
  theme_minimal()

# Latest available year: renewable share against CO2 per unit of electricity,
# one point per country, with a fitted straight line.
energy_co2_merged %>%
  # na.rm keeps the comparison meaningful if any row has a missing year;
  # without it max() would be NA and every row would be filtered out.
  filter(year == max(year, na.rm = TRUE)) %>%
  mutate(co2_per_kwh = co2 / electricity_generation) %>%
  ggplot(aes(x = renewables_share_elec, y = co2_per_kwh)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  # The subscript two is written as a Unicode escape so the labels no longer
  # depend on the encoding the file is saved or read in.
  labs(
    title = "Renewables Share vs CO\u2082 per kWh (Latest Year)",
    x = "Renewables Share (%)",
    y = "CO\u2082 per kWh"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Countries highlighted in the per-country time-series plots.
top5 <- c("United States", "China", "India", "Germany", "Brazil")

# CO2 per unit of electricity over time for the selected countries.
energy_co2_merged %>%
  filter(country %in% top5) %>%
  mutate(co2_per_kwh = co2 / electricity_generation) %>%
  # Drop missing ratios and cap extreme values so outliers do not flatten
  # the rest of the chart.
  filter(!is.na(co2_per_kwh), co2_per_kwh < 5) %>%
  ggplot(aes(x = year, y = co2_per_kwh, color = country)) +
  # `linewidth` replaces the `size` aesthetic for lines, which has been
  # deprecated since ggplot2 3.4.0.
  geom_line(linewidth = 1.2) +
  labs(
    title = "CO2 Emissions per kWh Over Time (Top 5 Economies)",
    x = "Year",
    y = expression(CO[2]~"per kWh"),
    color = "Country"
  ) +
  theme_minimal()

library(ggrepel)
# 2022 snapshot: renewable share of electricity against CO2 per kWh on
# log-log axes; bubble size follows population.
# The title and axis labels describe the variables actually plotted here
# (renewables_share_elec and co2_per_kwh).
# NOTE(review): relies on a `co2_per_kwh` column already present in
# energy_co2_merged; the chunks visible here only create it inside
# pipelines -- confirm where it is defined.
energy_co2_merged %>%
  filter(year == 2022) %>%
  ggplot(
    aes(
      x = renewables_share_elec,
      y = co2_per_kwh,
      size = population,
      label = country
    )
  ) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_text_repel(max.overlaps = 2, size = 4) +
  scale_x_log10() +
  scale_y_log10(labels = scales::comma) +
  labs(
    title = "Renewables Share vs CO2 per kWh in 2022 (Bubble Size = Population)",
    x = "Renewables Share (%, log scale)",
    y = "CO2 per kWh (log scale)",
    size = "Population"
  ) +
  theme_minimal()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 148 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps